# -*- coding: utf-8 -*-
"""Q-Learning.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1iNODBaJyYGyuWpEHmQ5fzmgYcv_PJu60
"""

import numpy as np
import random

# Define the environment
class GridWorld:
    """Deterministic grid environment whose cells hold the reward for entering them.

    States are ``(row, col)`` tuples. A cell value of 0 is an ordinary cell;
    any non-zero value (+1 or -1 in the built-in grid) is both the reward and
    a terminal cell that ends the episode. Moves that would leave the grid
    are clamped, so the agent stays on the boundary cell.
    """

    # Cell in which every episode starts (bottom-left corner of the grid).
    START_STATE = (2, 0)

    def __init__(self):
        """Build the 3x4 reward grid and place the agent on the start cell."""
        self.grid = np.array([
            [0, 0, 0, 1],
            [0, -1, 0, -1],
            [0, 0, 0, 0],
        ])
        # Derive the dimensions from the grid itself so they can never
        # disagree with the reward layout above.
        self.rows, self.cols = self.grid.shape
        self.state = self.START_STATE

    def reset(self):
        """Move the agent back to the start cell and return that state."""
        self.state = self.START_STATE
        return self.state

    def step(self, action):
        """Apply one move and return ``(state, reward, done)``.

        Args:
            action: 0 (up), 1 (right), 2 (down) or 3 (left).

        Returns:
            A tuple of the new ``(row, col)`` state, the reward stored in the
            cell that was entered (a native Python number), and a ``bool``
            that is True when that reward is non-zero.

        Raises:
            ValueError: If ``action`` is not one of the four known moves.
                The agent's position is left unchanged in that case.
        """
        row, col = self.state

        if action == 0:
            row = max(row - 1, 0)
        elif action == 1:
            col = min(col + 1, self.cols - 1)
        elif action == 2:
            row = min(row + 1, self.rows - 1)
        elif action == 3:
            col = max(col - 1, 0)
        else:
            # An unknown action used to be treated as "stay put" and still
            # returned a reward, which hides caller bugs.
            raise ValueError(
                f"invalid action {action!r}; expected 0 (up), 1 (right), "
                "2 (down) or 3 (left)"
            )

        self.state = (row, col)
        # .item() converts the NumPy scalar to a native Python number so
        # callers are not handed NumPy scalar types.
        reward = self.grid[row, col].item()
        done = bool(reward != 0)  # Episode ends when reaching +1 or -1

        return self.state, reward, done

# Hyperparameters for tabular Q-learning.
episodes = 1000  # number of training episodes to run
alpha = 0.1      # learning rate: step size of each Q-value update
gamma = 0.9      # discount factor applied to the next state's best Q-value
epsilon = 0.1    # probability of taking a random (exploratory) action

# Environment instance and its Q-table, indexed as [row, col, action] with
# four actions per cell; every entry starts at zero.
env = GridWorld()
q_shape = (env.rows, env.cols, 4)
q_table = np.zeros(q_shape)

# Train the Q-table with an epsilon-greedy behaviour policy.
for episode in range(episodes):
    state = env.reset()
    done = False

    while not done:
        row, col = state

        # With probability epsilon take a uniformly random action; otherwise
        # take the action with the highest current Q-value for this cell.
        action = (
            random.randint(0, 3)
            if random.uniform(0, 1) < epsilon
            else np.argmax(q_table[row, col])
        )

        # Advance the environment by one move.
        next_state, reward, done = env.step(action)

        # Move the current estimate a fraction alpha towards the target
        # "reward + discounted best value of the next state".
        next_row, next_col = next_state
        td_target = reward + gamma * np.max(q_table[next_row, next_col])
        current_q = q_table[row, col, action]
        q_table[row, col, action] = current_q + alpha * (td_target - current_q)

        state = next_state

# Show the learned action values of every grid cell, row by row.
print("Learned Q-table:")
for i, j in np.ndindex(env.rows, env.cols):
    print(f"State ({i},{j}): {q_table[i, j]}")

# Roll out the greedy (pure-exploitation) policy once from the start state,
# printing every move until a terminal cell is reached.
state = env.reset()
done = False
# Both the greedy policy and the environment are deterministic, so coming
# back to an already-visited state means the rollout would repeat forever.
visited = {state}
print("\nTesting learned policy:")
while not done:
    action = np.argmax(q_table[state[0], state[1]])
    state, reward, done = env.step(action)
    print(f"Action: {action}, New State: {state}, Reward: {reward}")

    if not done and state in visited:
        # Without this guard an unconverged Q-table (for example an argmax
        # tie that keeps pushing into a wall) makes the loop never end.
        print("Greedy policy revisited a state without reaching a terminal cell; stopping.")
        break
    visited.add(state)